{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2024-08-29 10:55:36.486\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mbyzerllm.utils.connect_ray\u001b[0m:\u001b[36mconnect_cluster\u001b[0m:\u001b[36m48\u001b[0m - \u001b[1mJDK 21 will be used (/Users/allwefantasy/.auto-coder/jdk-21.0.2.jdk/Contents/Home)...\u001b[0m\n",
      "2024-08-29 10:55:36,529\tINFO worker.py:1564 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...\n",
      "2024-08-29 10:55:36,530\tINFO worker.py:1582 -- Calling ray.init() again after it has already been called.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\n",
      "    {\"start_line\": 21, \"end_line\": 22},\n",
      "    {\"start_line\": 29, \"end_line\": 36},\n",
      "    {\"start_line\": 49, \"end_line\": 50},\n",
      "    {\"start_line\": 52, \"end_line\": 54}\n",
      "]\n",
      "20 - 21 : 文本:GMP基本要求\n",
      "28 - 35 : \"2、机构与人员\",\"5条\",\"3、机构与人员\",\"4节22条\"\n",
      "\"3、厂房与设施\",\"23条\",\"4、厂房与设施\",\"5节33条\"\n",
      "\"4、设备\",\"7条\",\"5、设备\",\"6节31条\"\n",
      "\"5、物料\",\"10条\",\"6、物料与产品\",\"7节36条\"\n",
      "\"6、卫生\",\"9条\",\"\",\"\"\n",
      "\"7、验证\",\"4条\",\"7、确认与验证\",\"12条\"\n",
      "\"8、文件\",\"5条\",\"8、文件管理\",\"6节34条\"\n",
      "48 - 49 : 文本:药品GMP基本要求\n",
      "51 - 53 : #/Users/allwefantasy/Downloads/docs/新版GMP解读.pptx#395\n",
      "文本:主要修订内容\n",
      "文本:GMP基本要求\n",
      "\"2、机构与人员\",\"5条\",\"3、机构与人员\",\"4节22条\"\n",
      "\"3、厂房与设施\",\"23条\",\"4、厂房与设施\",\"5节33条\"\n",
      "\"4、设备\",\"7条\",\"5、设备\",\"6节31条\"\n",
      "\"5、物料\",\"10条\",\"6、物料与产品\",\"7节36条\"\n",
      "\"6、卫生\",\"9条\",\"\",\"\"\n",
      "\"7、验证\",\"4条\",\"7、确认与验证\",\"12条\"\n",
      "\"8、文件\",\"5条\",\"8、文件管理\",\"6节34条\"\n",
      "文本:药品GMP基本要求\n",
      "#/Users/allwefantasy/Downloads/docs/新版GMP解读.pptx#395\n",
      "文本:主要修订内容\n"
     ]
    }
   ],
   "source": [
    "from typing import List, Dict\n",
    "from autocoder.common import AutoCoderArgs, SourceCode\n",
    "from byzerllm.utils.client.code_utils import extract_code\n",
    "import json\n",
    "from loguru import logger\n",
    "\n",
    "import byzerllm\n",
    "from byzerllm import ByzerLLM\n",
    "\n",
    "# Build the shared LLM client once; the prompt call in process_range_doc reuses it.\n",
    "llm = ByzerLLM.from_default_model(\n",
    "    \"deepseek_chat\"\n",
    ")\n",
    "\n",
    "# Prompt template for relevance-range extraction.\n",
    "# NOTE(review): byzerllm.prompt() is understood to render the docstring below as the\n",
    "# prompt text, filling `conversations` and `documents` through the {% for %} / {{ }}\n",
    "# placeholders -- confirm against the byzerllm documentation. Because the docstring is\n",
    "# runtime text sent to the model, it intentionally stays in Chinese.\n",
    "# The model is asked for a JSON array of at most 4 objects with integer, 1-based\n",
    "# \"start_line\" / \"end_line\" fields, or [] when nothing in the documents is relevant.\n",
    "# It is invoked in this cell as <function>.with_llm(llm).run(conversations, documents).\n",
    "@byzerllm.prompt()\n",
    "def extract_relevance_range_from_docs_with_conversation(\n",
    "    conversations: List[Dict[str, str]], documents: List[str]\n",
    ") -> str:\n",
    "    \"\"\"\n",
    "    根据提供的文档和对话历史提取相关信息范围。\n",
    "\n",
    "    输入:\n",
    "    1. 文档内容:\n",
    "    {% for doc in documents %}\n",
    "    {{ doc }}\n",
    "    {% endfor %}\n",
    "\n",
    "    2. 对话历史:\n",
    "    {% for msg in conversations %}\n",
    "    <{{ msg.role }}>: {{ msg.content }}\n",
    "    {% endfor %}\n",
    "\n",
    "    任务:\n",
    "    1. 分析最后一个用户问题及其上下文。\n",
    "    2. 在文档中找出与问题相关的一个或多个重要信息段。\n",
    "    3. 对每个相关信息段，确定其起始行号(start_line)和结束行号(end_line)。\n",
    "    4. 信息段数量不超过4个。\n",
    "\n",
    "    输出要求:\n",
    "    1. 返回一个JSON数组，每个元素包含\"start_line\"和\"end_line\"。\n",
    "    2. start_line和end_line必须是整数，表示文档中的行号。\n",
    "    3. 行号从1开始计数。\n",
    "    4. 如果没有相关信息，返回空数组[]。\n",
    "\n",
    "    输出格式:\n",
    "    严格的JSON数组，不包含其他文字或解释。\n",
    "\n",
    "    示例:\n",
    "    1.  文档：\n",
    "        1 这是这篇动物科普文。\n",
    "        2 大象是陆地上最大的动物之一。\n",
    "        3 它们生活在非洲和亚洲。\n",
    "        问题：大象生活在哪里？\n",
    "        返回：[{\"start_line\": 2, \"end_line\": 3}]\n",
    "\n",
    "    2.  文档：\n",
    "        1 地球是太阳系第三行星，\n",
    "        2 有海洋、沙漠，温度适宜，\n",
    "        3 是已知唯一有生命的星球。\n",
    "        4 太阳则是太阳系的唯一恒星。\n",
    "        问题：地球的特点是什么？\n",
    "        返回：[{\"start_line\": 1, \"end_line\": 3}]\n",
    "\n",
    "    3.  文档：\n",
    "        1 苹果富含维生素。\n",
    "        2 香蕉含有大量钾元素。\n",
    "        问题：橙子的特点是什么？\n",
    "        返回：[]        \n",
    "    \"\"\"  \n",
    "\n",
    "# result = extract_relevance_range_from_docs_with_conversation.with_llm(llm).run(conversations=conversations, documents=documents)\n",
    "\n",
    "\n",
    "# Each non-empty line of /tmp/rag.json is one JSON record shaped like\n",
    "# {\"conversation\": conversations, \"doc\": [doc.source_code]}.\n",
    "# Later records overwrite earlier ones, so only the last record is kept.\n",
    "conversations = None\n",
    "documents = None\n",
    "# Decode explicitly as UTF-8: the dump may hold non-ASCII text and the platform\n",
    "# default encoding is not guaranteed to match.\n",
    "with open(\"/tmp/rag.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    for raw_line in f:\n",
    "        record_text = raw_line.strip()\n",
    "        if not record_text:\n",
    "            # Blank or whitespace-only lines carry no record.\n",
    "            continue\n",
    "        record = json.loads(record_text)\n",
    "        conversations = record[\"conversation\"]\n",
    "        documents = record[\"doc\"]\n",
    "\n",
    "def process_range_doc(doc, max_retries=3):\n",
    "    \"\"\"Ask the LLM which line ranges of ``doc`` are relevant and keep only those lines.\n",
    "\n",
    "    The document text is rendered with 1-based line-number prefixes and passed,\n",
    "    together with the module-level ``conversations``, to\n",
    "    ``extract_relevance_range_from_docs_with_conversation``. The reply is parsed as a\n",
    "    JSON array of ``{\"start_line\": int, \"end_line\": int}`` objects and the matching\n",
    "    lines are concatenated in the order the ranges were returned.\n",
    "\n",
    "    Args:\n",
    "        doc: Object exposing ``source_code`` (str) and ``module_name``.\n",
    "        max_retries: Maximum number of attempts before giving up.\n",
    "\n",
    "    Returns:\n",
    "        A ``SourceCode`` with the same ``module_name`` whose ``source_code`` is the\n",
    "        extracted text. If every attempt fails (or ``max_retries`` is not positive),\n",
    "        the text gathered by the last attempt before its error -- possibly empty --\n",
    "        is returned instead.\n",
    "    \"\"\"\n",
    "    import traceback\n",
    "\n",
    "    source_code_lines = doc.source_code.split(\"\\n\")\n",
    "    # The numbered rendering does not change between attempts, so build it once.\n",
    "    source_code_with_line_number = \"\".join(\n",
    "        f\"{line_no} {line}\\n\"\n",
    "        for line_no, line in enumerate(source_code_lines, start=1)\n",
    "    )\n",
    "\n",
    "    content = \"\"\n",
    "    for attempt in range(max_retries):\n",
    "        # Start every attempt from scratch so partial results never accumulate.\n",
    "        content = \"\"\n",
    "        try:\n",
    "            extracted_info = extract_relevance_range_from_docs_with_conversation.with_llm(\n",
    "                llm\n",
    "            ).run(\n",
    "                conversations, [source_code_with_line_number]\n",
    "            )\n",
    "\n",
    "            # NOTE(review): extract_code presumably returns (lang, code) pairs for fenced\n",
    "            # blocks -- confirm against byzerllm. The prompt asks for bare JSON, so fall\n",
    "            # back to the raw reply when no fenced block is found.\n",
    "            code_blocks = extract_code(extracted_info)\n",
    "            json_str = code_blocks[0][1] if code_blocks else extracted_info\n",
    "\n",
    "            print(json_str)\n",
    "            json_objs = json.loads(json_str)\n",
    "\n",
    "            for json_obj in json_objs:\n",
    "                # The prompt defines both bounds as 1-based and inclusive; clamp them to\n",
    "                # the document so a bad reply cannot wrap around via negative indices.\n",
    "                start_line = max(int(json_obj[\"start_line\"]), 1)\n",
    "                end_line = min(int(json_obj[\"end_line\"]), len(source_code_lines))\n",
    "                if end_line < start_line:\n",
    "                    continue\n",
    "                # 1-based inclusive [start, end] maps to the 0-based slice [start - 1, end).\n",
    "                chunk = \"\\n\".join(source_code_lines[start_line - 1:end_line])\n",
    "                content += chunk + \"\\n\"\n",
    "                print(f\"{start_line} - {end_line} : {chunk}\")\n",
    "\n",
    "            return SourceCode(\n",
    "                module_name=doc.module_name, source_code=content.strip()\n",
    "            )\n",
    "        except Exception as e:\n",
    "            # Exception instances have no print_exc(); the traceback module prints the\n",
    "            # active exception without raising inside the handler.\n",
    "            traceback.print_exc()\n",
    "            if attempt < max_retries - 1:\n",
    "                logger.warning(f\"Error processing doc {doc.module_name}, retrying... (Attempt {attempt + 1}) attempts: {str(e)}\")\n",
    "            else:\n",
    "                logger.error(f\"Failed to process doc {doc.module_name} after {max_retries} attempts: {str(e)}\")\n",
    "\n",
    "    # Reached only when no attempt succeeded.\n",
    "    return SourceCode(module_name=doc.module_name, source_code=content.strip())\n",
    "\n",
    "# Smoke test: run the extractor on the first loaded document and show what it kept.\n",
    "m = process_range_doc(\n",
    "    SourceCode(module_name=\"test\", source_code=documents[0])\n",
    ")\n",
    "\n",
    "print(m.source_code)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
